/*
Produces the quarterly drug panel used in the core anti-cholesterol analysis

Aggregate choices up to the person x quarter level:
- ID
- Previous history/new user
- Choice this quarter


Step 1: use Redbook to classify drugs into indications


Step 2: create information on all enrolled people


Step 3: process the raw claims data
*/







/*
Create indication data based on Redbook and Cortellis
*/


// Build the NDC-level drug lookup: start from Redbook, attach an indication to
// each product, number the indications, and save a compact file keyed by ndcnum.
clear 
use "redbook"

// retain only the Redbook columns referenced below
keep ndcnum gennme maintin maintds thergrp thrgrds thercls thrclds prodnme mstfmds

// generate some additional markers
// tab prints the distribution of mstfmds for inspection;
// xr = 1 when mstfmds contains the text "Extended Release", 0 otherwise
tab mstfmds
gen xr = strpos(mstfmds, "Extended Release") > 0

// for later compression (using earlier map)
// Only rows whose gennme appears in generic_name_mapping are kept. This merge
// runs BEFORE gennme is upper-cased, so the mapping must be keyed on the
// original Redbook spelling.
// presumably generic_id (kept at the end of this section) comes from this file -- TODO confirm
merge m:1 gennme using generic_name_mapping
keep if _merge == 3
drop _merge

// the indication lists below are matched on upper-cased names
replace gennme = upper(gennme)
replace prodnme = upper(prodnme)

// indication by generic name; Redbook rows without a match are kept,
// rows that exist only in the list file are dropped
merge m:1 gennme using "gennme_lists_by_indication"
keep if _merge == 1 | _merge == 3
drop _merge

// set the generic-name-based indication aside so the product-name merge below
// can bring in its own `indication` variable without a name clash
rename indication indication_temp

// indication by product name; same keep rule as above
merge m:1 prodnme using "prodnme_lists_by_indication"
keep if _merge == 1 | _merge == 3
drop _merge

// MODIFIED: apply patch to account for pravachol
// Force the indication for pravastatin sodium and cerivastatin sodium,
// overriding whatever the product-name merge assigned.
replace indication = "Hypercholesterolemia" if gennme == "PRAVASTATIN SODIUM" | gennme == "CERIVASTATIN SODIUM"



// product-name indication takes precedence; fall back to the generic-name
// indication when it is blank, then drop drugs that still have none
replace indication = indication_temp if indication == ""
drop if indication == ""


// Build the indication -> indic_id map: one row per distinct indication,
// numbered by its row position after duplicates are dropped.
// NOTE(review): the numbering therefore depends on the row order of the data at
// this point, and the claims step later in this file filters on indic_id == 5.
// Anything that changes the row order here can change which indication is
// number 5 -- confirm the mapping in indication_id_map after every rebuild.
preserve
keep indication
duplicates drop
gen indic_id = _n
save indication_id_map, replace
restore


// attach indic_id back to every drug row
merge m:1 indication using indication_id_map
keep if _merge == 3
drop _merge

// final lookup: one row per Redbook row that survived the filters above
keep ndcnum generic_id indic_id xr

compress


save compressed_redbook_w_indications, replace






/*
2) Create basic information for all individuals
*/

// Seed an empty accumulator file so that every pass of the loop below can
// append to it unconditionally.
clear
generate enrolid = .
generate year = .
save all_enrolid_info, replace

// Stack the yearly enrollment files ccaea1996 ... ccaea2013. Each pass loads
// one year, stamps its rows with that year, appends everything accumulated so
// far, and rewrites the accumulator.
set more off
foreach yr of numlist 1996/2013 {
	use enrolid dobyr sex indstry using ccaea`yr', clear
	generate year = `yr'
	append using all_enrolid_info
	save all_enrolid_info, replace
}

// Reduce to one row per enrollee: earliest and latest year observed, plus
// dobyr / sex / indstry taken from the first row collapse sees for that enrolid.
collapse (min) start_year = year (max) end_year = year (first) dobyr sex indstry, by(enrolid)

compress
save all_enrolid_info, replace






///////////
// SETUP //
///////////
// Assemble the drug-claims extract from ccaed1996 ... ccaed2013, restricted to
// NDCs found in compressed_redbook_w_indications with indic_id == 5 and to
// enrollees found in all_enrolid_info.
clear
set more off
use enrolid genind ndcnum svcdate using ccaed1996

// filter data: this has changed
// attach drug attributes; claims whose NDC is not in the lookup are discarded
merge m:1 ndcnum using "compressed_redbook_w_indications", keepusing(generic_id indic_id xr) keep(match) nogenerate
keep if indic_id == 5

// start_year is used further down when deciding whether a first observed claim
// counts as new use (relevant for reduced form)
merge m:1 enrolid using "all_enrolid_info", keepusing(start_year) keep(match) nogenerate



forvalues yr = 1997/2013 {
	append using ccaed`yr', keep(enrolid genind ndcnum svcdate)

	// rows just appended carry no lookup values, so discard the lookup columns
	// and re-attach them for the whole accumulated dataset
	drop generic_id indic_id xr start_year

	merge m:1 ndcnum using "compressed_redbook_w_indications", keepusing(generic_id indic_id xr) keep(match) nogenerate
	keep if indic_id == 5

	// filter data (keep size down)
	merge m:1 enrolid using "all_enrolid_info", keepusing(start_year) keep(match) nogenerate

}

// every row already satisfies this after the loop; retained as a safeguard
keep if indic_id == 5


// original author's notes:
// panel has 15 million people (2.5 million in 10 year panel)
// data does match people over years


// generics: genind (type of situation and generic vs. brand)
// non-numeric genind values become missing and fall outside the 1-5 range kept here
destring genind, replace force
keep if inrange(genind, 1, 5)

// genind 4 or 5 -> generic; genind 3 -> generic_available
generate generic = inlist(genind, 4, 5)
generate generic_available = (genind == 3)


// Redbook-derived attributes (generic_id, indic_id, xr) were attached by the
// merges above; the raw genind code is no longer needed
drop genind


///////////////
// END SETUP //
///////////////

// Build the person x quarter choice panel from the filtered claims in memory.
//
// Files written:
//   user_first_cholesterol_dates     - enrolid + first_use_date for first claims
//                                      that pass the start-of-sample cutoff
//   user_raw_first_cholesterol_dates - enrolid + first_use_date for every
//                                      enrollee's first claim, with both flags
//   user_quarter_choice_cholesterol  - one row per enrolid x indic_id x quarter
//                                      holding the most-prescribed drug

// figure out when a user first starts using anything in the class
// Within each enrolid x indic_id, order claims by service date and put brand
// (generic == 0) ahead of generic on the same day, then flag the first row.
// This is done with an explicit by-group ordering rather than sort + egen tag():
// tag() re-sorts the data by its group variables and does not promise to pick
// the row that was first in the previous sort order, so the flag could land on
// a later claim. Rows with a missing group key are never flagged (as with tag()).
bysort enrolid indic_id (svcdate generic): gen byte first_use = (_n == 1) & !missing(enrolid, indic_id)

// label new subscription: a first claim dated before June 1 of the enrollee's
// start_year is not counted as new use (raw_first_use keeps the unfiltered flag)
// NOTE(review): mdy(6,1,start_year) is five months after January 1, while the
// original comment said "six months after the start of the sample" -- confirm
// whether July 1 was intended.
gen raw_first_use = first_use
replace first_use = 0 if first_use == 1 & svcdate < mdy(6,1,start_year) // MODIFIED


// dates of first use that survive the cutoff
preserve

keep if first_use == 1
keep enrolid svcdate
rename svcdate first_use_date

save user_first_cholesterol_dates, replace

restore

// dates of first observed use for everyone, with both flags for comparison
preserve

keep if raw_first_use == 1 
keep enrolid svcdate first_use raw_first_use
rename svcdate first_use_date

save user_raw_first_cholesterol_dates, replace

restore



gen year = year(svcdate)
gen quarter = qofd(svcdate)
tab quarter

gen prescriptions = 1
egen drugid = group(generic_id generic xr) // a drug is ingredient x generic x extended release

// Crude: get drug with most prescriptions in a given quarter
// First count prescriptions per enrollee, indication, quarter and drug;
// first_use becomes 1 for a drug row if any of its claims carried the flag.
collapse (sum) prescriptions (max) first_use (first) generic_id generic xr year, by(enrolid indic_id quarter drugid)


// patch for dealing with multiple prescriptions: spread the first-use flag to
// every drug row of the quarter so it survives whichever row is kept below
egen first_use_temp = max(first_use), by(enrolid indic_id quarter)
replace first_use = first_use_temp
drop first_use_temp

// keep top prescription for the quarter: dropped entries indicate possible
// combination therapy. Ties on prescription count go to the lowest drugid
// (need to clean this up later). The ordering is spelled out in the by-group
// for the same reason as above: sort + egen tag() does not guarantee that the
// top-ranked row is the one retained. Rows with a missing group key are
// dropped, as they were under tag().
gen double neg_prescriptions = -prescriptions
bysort enrolid indic_id quarter (neg_prescriptions drugid): keep if _n == 1 & !missing(enrolid, indic_id, quarter)
drop neg_prescriptions



sort enrolid indic_id quarter


save user_quarter_choice_cholesterol, replace 









